import string
from random import random

from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as f

if __name__ == '__main__':
    # Demo: SQL window functions (avg over(), row_number, dense_rank, rank,
    # ntile) against a small in-memory student/score dataset.
    spark = SparkSession.builder.appName("test1_dataFrame_create")\
        .master("local[*]").getOrCreate()
    sc = spark.sparkContext

    # Each tuple is parallelized as one row: (name, class, score).
    rdd = sc.parallelize([
        ('张三', 'class_1', 99),
        ('王五', 'class_2', 35),
        ('王三', 'class_3', 57),
        ('王久', 'class_4', 12),
        ('王丽', 'class_5', 99),
        ('王娟', 'class_1', 90),
        ('王军', 'class_2', 91),
        ('王俊', 'class_3', 33),
        ('王君', 'class_4', 55),
        ('王珺', 'class_5', 66),
        ('郑颖', 'class_1', 11),
        ('郑辉', 'class_2', 33),
        ('张丽', 'class_3', 36),
        ('张张', 'class_4', 79),
        ('黄凯', 'class_5', 90),
        ('黄开', 'class_1', 90),
        ('黄恺', 'class_2', 90),
        ('王凯', 'class_3', 11),
        ('王凯杰', 'class_1', 11),
        ('王开杰', 'class_2', 3),
        ('王景亮', 'class_3', 99)
    ])

    # Explicit schema so score is a proper integer column.
    schema = StructType().add("name", StringType()).add("class", StringType()).add("score", IntegerType())
    df = spark.createDataFrame(rdd, schema)
    # Register as a temp view so the queries below can reference it by name.
    df.createOrReplaceTempView("stu")

    # Aggregate window: empty over() appends the global average score to
    # every row without collapsing the result to a single row.
    spark.sql("""
        select *,avg(score) over() as avg_score from stu
    """).show()

    # Ranking windows: row_number (no gaps, arbitrary tie order),
    # dense_rank partitioned per class (ties share rank, no gaps),
    # rank (ties share rank, with gaps).
    spark.sql("""
    select *,row_number() over(order by score desc) as ro
        ,dense_rank() over(partition by class order by score desc) as do
        ,rank() over(order by score) as roo
    from stu
    """).show()

    # ntile(6): distribute rows into 6 roughly equal buckets by ascending
    # score. FIX: removed the trailing ';' inside the query string —
    # spark.sql() rejects a semicolon terminator with a ParseException.
    # Also aliased the column so show() prints a readable header.
    spark.sql("""
    select * ,ntile(6) over(order by score) as nt from stu
    """).show()

    spark.stop()









